In the models below, I use two complementary ensembling techniques. The first, algorithm-based ensembling, rebalances the training data to improve detection of the rare foreclosure class. The second, vote ensembling, combines the predictions of several models to improve generalization.
Models were run on ten datasets: one for each of the nine individual banks, plus a pooled dataset containing all nine.
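Before walking through the pipeline, here is a minimal, illustrative sketch of the two ideas (not the project's code: majority_vote is a hypothetical helper, and RUSBoostClassifier is imbalanced-learn's undersampling-based boosting):
# Illustrative only: the two ensembling techniques used below
import numpy as np
from imblearn.ensemble import RUSBoostClassifier  # algorithm-based: undersamples the majority class each boosting round

def majority_vote(preds):
    """Vote ensembling (hypothetical helper): combine 0/1 predictions from several models."""
    return (np.column_stack(preds).mean(axis=1) > 0.5).astype(int)

# rus = RUSBoostClassifier(random_state=2019).fit(X_train, y_train)  # addresses the foreclosure class imbalance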
# Load functions
%run Functions.ipynb
pd.set_option("display.max_columns", 200)
pd.set_option('display.max_rows', 200)
# Load data
with open(r'..\Data\Pickle\df.pkl', 'rb') as file_to_open:
    df = pickle.load(file_to_open)
# Drop the merge ID column (Loan ID)
df = df.drop(labels='Loan ID', axis=1)
# Convert Inf values to NA
df = df.replace([np.inf, -np.inf], np.nan)
# Verify Bank Counts
df['Bank'].value_counts()
# Variables to drop
dropvars = ['File Year', 'Year', 'Month', 'Region', 'FIPS', # 'Reported Period',
'Zip Code', 'Mortgage Insurance Type', 'Property State',
'First Payment', 'Original Loan-to-Value (LTV)']
df = df.drop(labels=dropvars, axis=1)
# Drop bank balance-sheet variable groups (Asset*, Liab*, Eqtot*, Dep*)
df = df.filter(regex=r'^(?!Asset|Liab|Eqtot|Dep).*$')
# Reformat Reported Period (numeric MMYYYY) as an MM/YYYY string, then convert both date columns
df['Reported Period'] = df['Reported Period'].astype(float).astype(int).astype(str)
df['Reported Period'] = df['Reported Period'].apply(lambda x: x.zfill(6))
df['Reported Period'] = df['Reported Period'].map(lambda x: x[:2] + '/' + x[2:])
df = change_date(df, 'Reported Period')
df = change_date(df, 'Original Date')
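change_date is defined in Functions.ipynb and is not shown here; a plausible minimal version (an assumption on my part) parses the MM/YYYY strings and replaces them with ordinal dates the models can use:
# Hypothetical stand-in for the change_date helper (actual version lives in Functions.ipynb)
def change_date_sketch(df, col):
    parsed = pd.to_datetime(df[col], format='%m/%Y')  # 'MM/YYYY' -> Timestamp
    df[col] = parsed.map(pd.Timestamp.toordinal)      # numeric, so tree models can split on it
    return df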
# Drop rows with missing values
df = df.dropna()
# All data
y_all = df['Foreclosed']
X_all = df.drop(labels=['Foreclosed', 'Zero Balance Code'], axis=1)
# Split Train (70%)
X_train, X_test, y_train, y_test = train_test_split(X_all, y_all, test_size=0.3,
                                                    stratify=y_all, random_state=2019)
# Split Val (15%) and Test (15%)
X_val, X_test, y_val, y_test = train_test_split(X_test, y_test, test_size=0.5,
                                                stratify=y_test, random_state=2019)
# One-hot encode remaining categorical columns (keep the Bank labels for later alignment)
Bnk_train = X_train['Bank'].reset_index(drop=True)
X_train = onehotencoding(X_train)
Bnk_val = X_val['Bank'].reset_index(drop=True)
X_val = onehotencoding(X_val)
Bnk_test = X_test['Bank'].reset_index(drop=True)
X_test = onehotencoding(X_test)
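onehotencoding also comes from Functions.ipynb; a minimal sketch of what it presumably does (dummy-encode the remaining categorical columns and reset the index so rows line up with the Bnk_* series):
# Hypothetical stand-in for the onehotencoding helper in Functions.ipynb
def onehotencoding_sketch(df):
    cat_cols = df.select_dtypes(include='object').columns.tolist()  # e.g. 'Bank' and other string columns
    return pd.get_dummies(df, columns=cat_cols).reset_index(drop=True)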
# PCA on macroeconomic (FRED) variables, fit on the training set only (the test set is not used in the fit)
X_train, X_val, X_test = pca_fred(X_train, X_val, X_test)
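pca_fred is another Functions.ipynb helper. A sketch under the assumption implied by the comment above: PCA is fit on the training set's macroeconomic columns only, and the same projection is applied to the validation and test sets (macro_cols and the component count are placeholders):
# Hypothetical stand-in for pca_fred: train-only PCA fit avoids test-set leakage
from sklearn.decomposition import PCA

def pca_fred_sketch(X_tr, X_v, X_te, macro_cols, n_components=3):
    pca = PCA(n_components=n_components).fit(X_tr[macro_cols])
    frames = []
    for X in (X_tr, X_v, X_te):
        comps = pca.transform(X[macro_cols])
        X = X.drop(columns=macro_cols)
        for i in range(n_components):
            X[f'Macro PC{i + 1}'] = comps[:, i]
        frames.append(X)
    return tuple(frames)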
# Check columns
X_train.columns
# List of banks
banks = ['Bank of America','Wells Fargo Bank','CitiMortgage',
'JPMorgan Chase','GMAC Mortgage','SunTrust Mortgage',
'AmTrust Bank','PNC Bank','Flagstar Bank']
# Subset the data by bank
Banks_X, Banks_y = Bank_Subsets(banks, df_X = X_train, df_y = y_train)
Banks_X_val, Banks_y_val = Bank_Subsets(banks, df_X = X_val, df_y = y_val)
Banks_X_test, Banks_y_test = Bank_Subsets(banks, df_X = X_test, df_y = y_test)
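Bank_Subsets is defined in Functions.ipynb; a plausible minimal version (the 'Bank_<name>' dummy-column convention is an assumption) returns per-bank dicts of features and labels:
# Hypothetical stand-in for Bank_Subsets: dicts of per-bank X/y, keyed by bank name
def Bank_Subsets_sketch(bank_list, df_X, df_y):
    Xs, ys = {}, {}
    for bank in bank_list:
        mask = (df_X['Bank_' + bank] == 1).values                 # one-hot indicator column (assumed naming)
        Xs[bank] = df_X.loc[mask].filter(regex=r'^(?!Bank).*$')   # drop Bank dummies within the subset
        ys[bank] = df_y.loc[mask]
    return Xs, ys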
# Drop the Bank indicator columns from the pooled datasets
X_train = X_train.filter(regex=r'^(?!Bank).*$')
X_val = X_val.filter(regex=r'^(?!Bank).*$')
X_test = X_test.filter(regex=r'^(?!Bank).*$')
print('Shape:', X_train.shape)
# Run bottom-layer models (individual banks and all banks)
## Bank and Classifier Lists
banks_plus = banks + ['All Banks']
clfs = [rfc1, rfc2, rus]
clfs_str = ['RFC', 'RFC PCA', 'RUS Boost']
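The three classifiers are defined in Functions.ipynb; plausible definitions consistent with the labels above (the hyperparameters here are assumptions):
# Hypothetical definitions for the classifiers named above (actual ones live in Functions.ipynb)
from sklearn.ensemble import RandomForestClassifier
from sklearn.decomposition import PCA
from sklearn.pipeline import make_pipeline
from imblearn.ensemble import RUSBoostClassifier

rfc1 = RandomForestClassifier(n_estimators=100, class_weight='balanced', random_state=2019)  # 'RFC'
rfc2 = make_pipeline(PCA(n_components=10),
                     RandomForestClassifier(n_estimators=100, random_state=2019))            # 'RFC PCA'
rus = RUSBoostClassifier(random_state=2019)                                                  # 'RUS Boost'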
## Add the pooled (all-banks) data as the tenth dataset
Banks_y['All Banks'] = y_train
Banks_X['All Banks'] = X_train
Banks_y_val['All Banks'] = y_val
Banks_X_val['All Banks'] = X_val
Banks_y_test['All Banks'] = y_test
Banks_X_test['All Banks'] = X_test
## Run the classifier function: fit each model per bank, tune thresholds, and predict
(vote_models, vote_thresholds,
 vote_proba, vote_pred) = clf_pred_func(bnk_list=banks_plus, clfs=clfs, clfs_str=clfs_str,
                                        Banks_y=Banks_y, Banks_X=Banks_X,
                                        Banks_y_val=Banks_y_val, Banks_X_val=Banks_X_val,
                                        Banks_y_test=Banks_y_test, Banks_X_test=Banks_X_test)
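clf_pred_func is the workhorse from Functions.ipynb. A sketch of the presumed logic (the F1-maximizing threshold search is an assumption): fit each classifier on each bank's training data, choose a probability threshold on the validation set, and return hard test-set predictions keyed by (bank, classifier):
# Hypothetical stand-in for clf_pred_func (actual version in Functions.ipynb)
import numpy as np
from sklearn.base import clone
from sklearn.metrics import f1_score

def clf_pred_func_sketch(bnk_list, clfs, clfs_str, Banks_y, Banks_X,
                         Banks_y_val, Banks_X_val, Banks_y_test, Banks_X_test):
    models, thresholds, proba, pred = {}, {}, {}, {}
    for bank in bnk_list:
        for clf, name in zip(clfs, clfs_str):
            key = (bank, name)
            models[key] = clone(clf).fit(Banks_X[bank], Banks_y[bank])  # fresh copy per bank
            p_val = models[key].predict_proba(Banks_X_val[bank])[:, 1]
            grid = np.arange(0.05, 0.95, 0.05)
            thresholds[key] = max(grid, key=lambda t: f1_score(Banks_y_val[bank], p_val >= t))
            proba[key] = models[key].predict_proba(Banks_X_test[bank])[:, 1]
            pred[key] = (proba[key] >= thresholds[key]).astype(int)
    return models, thresholds, proba, pred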
# Vote classifier
## Create voting dataframes
votes = votes_clf_func(vote_pred = vote_pred, bnk_list = banks_plus,
clfs_str = clfs_str, X = Bnk_test)
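votes_clf_func assembles per-bank voting tables; a minimal sketch, assuming vote_pred is keyed by (bank, classifier) pairs (the real helper also uses the Bnk_test labels, X, to align the pooled model's rows):
# Hypothetical stand-in for votes_clf_func: one DataFrame per bank, one 0/1 column per classifier
def votes_clf_func_sketch(vote_pred, bnk_list, clfs_str, X):
    return {bank: pd.DataFrame({name: vote_pred[(bank, name)] for name in clfs_str})
            for bank in bnk_list}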
combined_votes = pd.Series(dtype=int)
combined_actuals = pd.Series(dtype=int)
# Middle layer: append the pooled (all-banks) vote to each bank's vote table
## The pooled prediction is 1 only when all of the all-banks classifiers agree
all_bnks_pred = (votes['All Banks'].iloc[:, :len(clfs_str)].sum(axis=1) /
                 len(clfs_str)).map(lambda x: 1 if x == 1.0 else 0)
for bank_str in banks:
    print(bank_str)
    votes[bank_str].loc[:, 'All Banks'] = all_bnks_pred.loc[Bnk_test == bank_str].reset_index(drop=True)
    total_vote = votes[bank_str].loc[:, 'All Banks']
    votes[bank_str].loc[:, 'All Banks 2'] = total_vote  # keep a second copy of the pooled vote
    print('Predicted Foreclosures', np.mean(total_vote).round(2))
    print('Actual Foreclosures', np.mean(Banks_y_test[bank_str]).round(2))
    print('F1 Score after Voting', f1_score(Banks_y_test[bank_str], total_vote).round(2))
    print('Recall after Voting', recall_score(Banks_y_test[bank_str], total_vote).round(2))
    print('Precision after Voting', precision_score(Banks_y_test[bank_str], total_vote).round(2))
    print('')
    # Combine banks
    combined_votes = pd.concat([combined_votes, total_vote], axis=0)
    combined_actuals = pd.concat([combined_actuals, Banks_y_test[bank_str]], axis=0)
print('Combined Predictions')
print('Predicted Foreclosures', np.mean(combined_votes).round(2))
print('Actual Foreclosures', np.mean(combined_actuals).round(2))
print('F1 Score after Voting', f1_score(combined_actuals, combined_votes).round(2))
print('Recall after Voting', recall_score(combined_actuals, combined_votes).round(2))
print('Precision after Voting', precision_score(combined_actuals, combined_votes).round(2))
combined_votes = pd.Series(dtype=int)
combined_actuals = pd.Series(dtype=int)
# Final vote: majority across the 3 bank-level models plus the pooled model
## With 4 voters, the 0.67 cutoff requires at least 3 of 4 to predict foreclosure
for bank_str in banks:
    print(bank_str)
    votes[bank_str].loc[:, 'Majority'] = (votes[bank_str].iloc[:, :(len(clfs_str) + 1)].sum(axis=1) /
                                          (len(clfs_str) + 1)).map(lambda x: 1 if x > 0.67 else 0)
    total_vote = votes[bank_str].loc[:, 'Majority']
    print('Predicted Foreclosures', np.mean(total_vote).round(2))
    print('Actual Foreclosures', np.mean(Banks_y_test[bank_str]).round(2))
    print('F1 Score after Voting', f1_score(Banks_y_test[bank_str], total_vote).round(2))
    print('Recall after Voting', recall_score(Banks_y_test[bank_str], total_vote).round(2))
    print('Precision after Voting', precision_score(Banks_y_test[bank_str], total_vote).round(2))
    print('')
    # Combine banks
    combined_votes = pd.concat([combined_votes, total_vote], axis=0)
    combined_actuals = pd.concat([combined_actuals, Banks_y_test[bank_str]], axis=0)
print('Combined Predictions')
print('Predicted Foreclosures', np.mean(combined_votes).round(2))
print('Actual Foreclosures', np.mean(combined_actuals).round(2))
print('F1 Score after Voting', f1_score(combined_actuals, combined_votes).round(2))
print('Recall after Voting', recall_score(combined_actuals, combined_votes).round(2))
print('Precision after Voting', precision_score(combined_actuals, combined_votes).round(2))
print('Accuracy after Voting', accuracy_score(combined_actuals, combined_votes).round(2))
print('')
print('Confusion Matrix')
print(confusion_matrix(combined_actuals, combined_votes))
# Save models
with open(r"..\Data\Pickle\models.pkl", "wb") as file_to_store:
    pickle.dump(vote_models, file_to_store)
# Save thresholds
with open(r"..\Data\Pickle\model_thresholds.pkl", "wb") as file_to_store:
    pickle.dump(vote_thresholds, file_to_store)
# Save predictions
with open(r"..\Data\Pickle\predictions.pkl", "wb") as file_to_store:
    pickle.dump(vote_pred, file_to_store)
# Save final votes
with open(r"..\Data\Pickle\df_votes.pkl", "wb") as file_to_store:
    pickle.dump(votes, file_to_store)